I. Data visualization at a glance

Anatomy of Facebook by Facebook Data Science Team

Seeking safety by The Economist Data Team

II. Why doing plots in R?

R

RStudio

i. Explore before you analyze

ii. Present after you analyze

iii. More powerful than excel/ SPSS/ etc…

III. Base R review:

Data frame: think like a matrix

Packages: tools that upgrade R power

install.packages("ggplot2")
install.packages("lattice")

IV. Making plots with R (lattice v.s. ggplot2)

  1. faster (though only noticeable over many and large plots)

  2. simpler (at first)

  3. better at trellis graphs

  4. able to do 3d graphs

  1. generally more elegant

  2. more syntactically logical (and therefore simpler, once you learn it)

  3. better at grouping

  4. able to interface with maps

  5. well documented [http://docs.ggplot2.org/current/]

Basic usage: lattice

library(lattice)

The general call for lattice graphics looks something like this:

graph_type(formula, data=, [options])

The specifics of the formula differ for each graph type, but the general format is straightforward

y             # Show the distribution of y

y~x           # Show the relationship between x and y 

y~x|A         # Show the relationship between x and y conditional on the values of A

y~x|A*B       # Show the relationship between x and y conditional on the combinations of A and B

z~y*x         # Show the 3D relationship between x, y, and z

Basic usage: ggplot2

library(ggplot2)

The general call for ggplot2 graphics looks something like this:

ggplot(data=, aes(x=,y=, [options])) + geom_xxxx() + ... + ... + ...

Note that ggplot2 graphs in layers in a continuing call (hence the endless +…+…+…), which really makes the extra layer part of the call

...+geom_xxxx(data=, aes(x=,y=,[options]),[options])+...+...+...

You can see the layering effect by comparing the same graph with different colors for each layer

ggplot(data=data, aes(x=year, y=realgdpgr)) + geom_point(color="black") + geom_point(aes(x=year, y=unemp), color="red")

ggplot(data=data, aes(x=year, y=realgdpgr)) + geom_point(color="red") + geom_point(aes(x=year, y=unemp), color="black")

i. Plot types

Scatter plot

ggplot(data=data, aes(x=year, y=outlays)) + geom_point() # ggplot2

xyplot(outlays~year, data=data) # lattice

Line plot

ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) + geom_line() # ggplot2 

xyplot(outlays~year, data=data[data$country=="USA",], type="l") # lattice

Boxplot

ggplot(data=data, aes(x=country, y=outlays)) + geom_boxplot() # ggplot2

bwplot(outlays~country, data=data) # lattice

Bar chart/ density plot

# Create data.frame of average growth rates by country over time
library(plyr)
growth <- ddply(.data=data, .variables=.(country), summarize, mean=mean(realgdpgr, na.rm=T))

ggplot(data=growth, aes(x=country, y=mean)) + geom_bar(stat="identity") # ggplot2

barchart(mean~country, data=growth) # lattice

ggplot(data=data, aes(x=vturn)) + geom_density() # ggplot2

densityplot(~vturn, data=data) # lattice

2-d density

Contour plots
data(volcano) # Load volcano contour data
volcano[1:10, 1:10] # Examine volcano dataset (first 10 rows and columns)
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  100  100  101  101  101  101  101  100  100   100
##  [2,]  101  101  102  102  102  102  102  101  101   101
##  [3,]  102  102  103  103  103  103  103  102  102   102
##  [4,]  103  103  104  104  104  104  104  103  103   103
##  [5,]  104  104  105  105  105  105  105  104  104   103
##  [6,]  105  105  105  106  106  106  106  105  105   104
##  [7,]  105  106  106  107  107  107  107  106  106   105
##  [8,]  106  107  107  108  108  108  108  107  107   106
##  [9,]  107  108  108  109  109  109  109  108  108   107
## [10,]  108  109  109  110  110  110  110  109  109   108
volcano3d <- reshape2::melt(volcano) # Use reshape2 package to melt the data
head(volcano3d) # Examine volcano3d dataset (head)
##   Var1 Var2 value
## 1    1    1   100
## 2    2    1   101
## 3    3    1   102
## 4    4    1   103
## 5    5    1   104
## 6    6    1   105
names(volcano3d) <- c("xvar", "yvar", "zvar") # Rename volcano3d columns

ggplot(data=volcano3d, aes(x=xvar, y=yvar, z = zvar)) + geom_contour() # ggplot2

contourplot(zvar~xvar + yvar, data=volcano3d) # lattice

tile/image/level plots
ggplot(data=volcano3d, aes(x=xvar, y=yvar, z = zvar)) + geom_tile(aes(fill=zvar)) # ggplot2

levelplot(zvar~xvar + yvar, data=volcano3d) # lattice

lattice: 3D plots

# Create a subset of the dataset containing only data for France
france.data <- data[data$country=="France",]
cloud(outlays~year*realgdpgr, data=france.data)

# Create a subset of the dataset containing only data for Greece, Portugal, Ireland, and Spain
pigs.data <- data[data$country %in% c("Greece", "Portugal", "Ireland", "Spain"),]
cloud(outlays~year*realgdpgr|country, data=pigs.data)

ggplot2: Add smoothing lines and curves

ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point()

# Add linear model (lm) smoother
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point() + 
  geom_smooth(method="lm")

# Add local linear model (loess) smoother, span of 0.75 
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point() + 
  geom_smooth(method="loess", span=.75)

# Add local linear model (loess) smoother, span of 0.25 
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point() + 
  geom_smooth(method="loess", span=.25)

# Add linear model (lm) smoother, no standard error shading 
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point() + 
  geom_smooth(method="lm", se=F)

# Add local linear model (loess) smoother, no standard error shading 
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point() + 
  geom_smooth(method="loess", se=F)

# Add a local linear (loess) smoother for each country
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point(aes(color=country)) + 
  geom_smooth(aes(color=country))

# Add a local linear (loess) smoother for each country, no standard error shading
ggplot(data=pigs.data, aes(x=year, y=outlays)) +
  geom_point(aes(color=country, size=realgdpgr)) + 
  geom_smooth(aes(color=country), se=F)

ii. Plot layout

Title & axis

ggplot(data=data, aes(x=year, y=outlays))  +  geom_point() + 
xlab(label="Voter Turnout (%)") + ylab(label="Government Outlays") + 
ggtitle(label="Cool Graph") # ggplot2

xyplot(outlays~year, data=data, xlab="Year", ylab="Government Outlays", main
 ="Cool Graph") # lattice

legend

“trellis” plots

ggplot(data=data, aes(x=year, y=outlays)) + geom_point() + facet_wrap(~country) # ggplot2

xyplot(outlays~year|country, data=data) # lattice

Panel plot/ Table

  • Both lattice and ggplot2 graphs can be combined using the grid.arrange() function in the gridExtra package
# Initialize gridExtra library
library(gridExtra)
# Create 3 plots to combine in a table
plot1 <- ggplot(data=pigs.data, aes(x=year, y=outlays, color=)) + 
  geom_line(aes(color=country))
plot2 <- ggplot(data=pigs.data, aes(x=year, y=outlays, linetype=)) + 
  geom_line(aes(linetype=country))
plot3 <- ggplot(data=pigs.data, aes(x=year, y=outlays, shape=)) + 
  geom_point(aes(shape=country))
# Call grid.arrange
grid.arrange(plot1, plot2, plot3, nrow=3, ncol=1)

Exporting

Two basic image types

  1. Raster/Bitmap (.png, .jpeg)

Every pixel of a plot contains its own separate coding; not so great if you want to resize the image

jpeg(filename="example.png", width=, height=)
plot(x,y)
dev.off()
  1. Vector (.pdf, .ps)

Every element of a plot is encoded with a function that gives its coding conditional on several factors; great for resizing

pdf(filename="example.pdf", width=, height=)
plot(x,y)
dev.off()
Exporting with lattice v. ggplot
# Assume we saved our plot is an object called example.plot

# lattice
trellis.device(device="pdf", filename="example.pdf")
print(example.plot)
dev.off()

# ggplot2
ggsave(filename="example.pdf", plot=example.plot, scale=, width=, height=) # ggplot2

iii. ggplot2 and the Grammar of Graphics

  • By now, you might be noticing some trends in how these two packages approach graphics

  • lattice tends to focus on a particular type of graph and how to represent cross-sectional variation by splitting it up into smaller chunks

  • Becoming a proficient user of lattice requires learning a huge array of graph-specific formulas and options

  • ggplot2 tries to represent much more of the cross-sectional variation by making use of various “aesthetics”; general approach is based on The Grammar of Graphics

  • Basic idea is that the visualization of all data requires four items

  1. One or more statistics conveying information about the data (identities, means, medians, etc.)

  2. A coordinate system that differentiates between the intersections of statistics (at most two for ggplot, three for lattice)

  3. Geometries that differentiate between off-coordinate variation in kind

  4. Scales that differentiate between off-coordinate variation in degree

  • ggplot2 allows the user to manipulate all four of these items

V. Plotting in action

Hurricanes data

The National Hurricane Center (NHC) collects datasets with all storms around the globe.

For all storms, we have the location of the storm, every six hours (at midnight, six a.m., noon and six p.m.). Note that we have also the date, the maximal wind speed – on a 6 hour window – and the pressure in the eye of the storm. Based on the location of the storm, we also record the “Tropical basin”, a geological sub-group, for each storm.

For all major storms falling between 1989 to 2013, I add information on deaths and economic damages for major hurricanes over that period from Wikipedia. The information is stored in “AtlanticHurricanes.rda”.

You can download the data files at [https://www.ocf.berkeley.edu/~wcai/share/hurricanes/]

# Load packages
library(ggplot2)

# Load data without unziping
library(data.table)
filename <- "./hurricanes/hurricane.csv.bz2"
con <- bzfile(filename, open = 'r')
dt <- read.csv(con, na.strings = '', stringsAsFactors = FALSE)
# (optional) turn into data.table object
dt <- as.data.table(dt)

summary(dt)
##       V1                Season          Num           Basin          
##  Length:327253      Min.   :1842   Min.   : 1.00   Length:327253     
##  Class :character   1st Qu.:1938   1st Qu.: 5.00   Class :character  
##  Mode  :character   Median :1969   Median :10.00   Mode  :character  
##                     Mean   :1961   Mean   :11.89                     
##                     3rd Qu.:1991   3rd Qu.:16.00                     
##                     Max.   :2014   Max.   :61.00                     
##   Sub_basin             Name             ISO_time        
##  Length:327253      Length:327253      Length:327253     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     Nature             Latitude        Longitude         Wind.kt      
##  Length:327253      Min.   :-999.0   Min.   :-999.0   Min.   :-999.0  
##  Class :character   1st Qu.:-999.0   1st Qu.:-999.0   1st Qu.:-999.0  
##  Mode  :character   Median : -17.1   Median :-100.2   Median :   0.0  
##                     Mean   :-413.0   Mean   :-398.7   Mean   :-400.1  
##                     3rd Qu.:  18.2   3rd Qu.: 103.0   3rd Qu.:  35.0  
##                     Max.   :  72.5   Max.   : 180.0   Max.   : 165.0  
##   Pressure.mb          V13                 V14               V15         
##  Min.   :-999.00   Length:327253      Min.   :-999.00   Min.   :-999.00  
##  1st Qu.:-999.00   Class :character   1st Qu.:-999.00   1st Qu.:-999.00  
##  Median :   0.00   Mode  :character   Median :-100.00   Median :-100.00  
##  Mean   : -21.23                      Mean   :-421.67   Mean   :-418.37  
##  3rd Qu.: 990.00                      3rd Qu.:  32.42   3rd Qu.:  33.27  
##  Max.   :9999.00                      Max.   : 100.00   Max.   : 100.00  
##      V16            Degrees_North     Deegrees_East    
##  Length:327253      Min.   :-68.500   Min.   :-180.00  
##  Class :character   1st Qu.:-14.100   1st Qu.:  39.63  
##  Mode  :character   Median : 14.200   Median :  92.40  
##                     Mean   :  7.452   Mean   :  61.81  
##                     3rd Qu.: 22.000   3rd Qu.: 132.11  
##                     Max.   : 72.500   Max.   : 180.00

What are you going to do with this data?

Filter out missing data and unnamed hurricanes

# Data cleaning
myNames <- c("Season", 
             "Num", "Basin", "Sub_basin", "Name", "ISO_time", "Nature",
             "Latitude", "Longitude", "Wind.kt", "Pressure.mb",
             "Degrees_North", "Deegrees_East")

dt <- dt[Latitude > -999 & Longitude > -999 & Wind.kt>0 & 
           !Name %in% c("UNNAMED", "NONAME:UNNAMED"),
         myNames, with=FALSE]

Remove white space, add storm ID and add months

myMonth <- function(x){
  time.date = strsplit(x, " ")
  iso.date = unlist(lapply(time.date, function(x) x[1]))
  iso.month = substr(iso.date, 6, 7)
  factor(iso.month, labels = c(month.name))  
}

dt[, `:=`(Basin = gsub(" ", "", Basin),
          ID = paste(Name, Season, sep = "."),
          Month = myMonth(ISO_time))]

Explore the distribution of the data

plot(table(dt$Season))

table(dt$Basin)
## 
##    EP    NA    NI    SA    SI    SP    WP 
## 22136 22054  3466    61 17597  9019 22039

Inclass Practice

1. Plot the relationship between “wind knots” and “month”

Does this fits with your intuition?

2. Create a “trellis” plot of “wind knots” v.s. “month”, conditional on “Basin” ; add a smoothing line to the plot.

3. Create a “trellis” plot of “wind knots” v.s. “month”, conditional on “Season” and “Basin”; Put “Season” on x-axis, “Basin” on Y-axis; Only use data after year 2009.

VI. Extra tips

1. Moderate font size (title/ legend/ axis): not too BIG or SMALL

2. Meaningful axis name

3. Clarity of plot (fix axis range)

4. Don’t include too many information (dimensions) in one plot! Split into multiple plots

PLUS: 10 tips for making your R graphics look their best (by Revolution R)

http://blog.revolutionanalytics.com/2009/01/10-tips-for-making-your-r-graphics-look-their-best.html

VII. Summary

1. Always explore data before fitting models!

2. ggplot (adds layer by layer) v.s. lattice (a function for each type of plot, use formula to explore high dimentional data)

3. The combination is limitless

For your own practice

Data: [https://www.ocf.berkeley.edu/~wcai/share/airlines/AirlineDataAll.csv]

You’re welcome to try out either lattice or ggplot for these questions, but in the solutions we’ll focus on the ggplot approach.

For some of these you may want to use a smaller version of the dataset, such as a random subset, subset <- air[sample(1:nrow(air), 10000, replace = FALSE), ].

Basics

  1. Plot a histogram of the flight delays with negative delays set to zero, censoring delay times at a maximum of 60 minutes.

  2. Plot the arrival delay against the departure delay as a scatterplot.

  3. Clean up your scatterplot with a title and axis labels. Output it as a PDF and see if you’d be comfortable with including it in a report/paper.

  4. Make a boxplot of the departure delay as a function of the day of week.

Using the ideas

  1. Create a trellis plot of departure delay boxplots, one per destination for this subset of destinations, DestSubset <- c('LAX','SEA','PHX','DEN','MSP','JFK','ATL','DFW','IAH', 'ORD'). Use a 2x5 layout of panels in the plot.

  2. Subset the data to flights going to Chicago (ORD) and Houston (IAH). Plot arrival delay against scheduled departure time (CRSDepTime). Now plot so that flights to Chicago are in one color and those to Houston in another. Use scale_x_continuous() and scale_y_continuous() to set the x-axis limits to be in the range from 6 am to midnight and the y-axis limits to be in the range (-10, 120).

Advanced

  1. Create a trellis plot where, for a given destination (see the subset in question 5), each panel uses a) hollow circles to plot departure delay as a function of time of day, and b) a red loess smoother without standard errors to plot the trend in departure delay over time of day. Limit the time of day shown to 6 am to midnight, and turn off the grey background. Figure out how to use partially-transparent points to reduce the effect of the overplotting of points.

Appendix: lattice v. ggplot2: options

[axis + size scaling]

ggplot(data=data, aes(x=year, y=outlays)) + geom_point() # ggplot2

ggplot(data=data, aes(x=year, y=outlays)) + geom_point(size=3) # ggplot2

ggplot(data=data, aes(x=year, y=outlays)) + geom_point(size=1) # ggplot2

xyplot(outlays~year, data=data) # lattice

xyplot(outlays~year, data=data, cex=2) # lattice

xyplot(outlays~year, data=data, cex=.5) # lattice

[graphical parameters]

  • Colors
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(color=colors()[145]) # ggplot2

ggplot(data=data, aes(x=year, y=outlays)) + geom_point(color="red") # ggplot2

xyplot(outlays~year, data=data, col=colors()[145]) #lattice

xyplot(outlays~year, data=data, col="red") #lattice

  • Point Styles and Widths
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(shape=3) # ggplot2

ggplot(data=data, aes(x=year, y=outlays)) + geom_point(shape=15) # ggplot2

xyplot(outlays~year, data=data, pch=3) # lattice

xyplot(outlays~year, data=data, pch=15) # lattice

  • Point Styles and Widths
ggplot(data=data, aes(x=year, y=outlays)) + geom_point(shape=3) # ggplot2

ggplot(data=data, aes(x=year, y=outlays)) + geom_point(shape=15) # ggplot2

ggplot(data=data, aes(x=year, y=outlays)) + geom_point(shape="w") # ggplot2

ggplot(data=data, aes(x=year, y=outlays)) + geom_point(shape="$", size=5) # ggplot2

xyplot(outlays~year, data=data, pch=3) # lattice

xyplot(outlays~year, data=data, pch=15) # lattice

xyplot(outlays~year, data=data, pch="w") # lattice

xyplot(outlays~year, data=data, pch="$", cex=2) # lattice

  • Line Styles and Widths
ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) + 
geom_line(linetype=1) # ggplot2

ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) + 
geom_line(linetype=2) # ggplot2

ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) + 
geom_line(linetype=3) # ggplot2

ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) + 
geom_line(linetype=3, size=1) # ggplot2

ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) + 
geom_line(linetype=3, size=1.5) # ggplot2

ggplot(data=data[data$country=="USA",], aes(x=year, y=outlays)) + 
geom_line(linetype=3, size=2) # ggplot2

xyplot(outlays~year, data=data[data$country=="USA",], type="l", lty=1) # lattice

xyplot(outlays~year, data=data[data$country=="USA",], type="l", lty=2) # lattice

xyplot(outlays~year, data=data[data$country=="USA",], type="l", lty=3) # lattice

xyplot(outlays~year, data=data[data$country=="USA",], type="l", lty=3, lwd=2) # lattice

xyplot(outlays~year, data=data[data$country=="USA",], type="l", lty=3, lwd=3) # lattice

xyplot(outlays~year, data=data[data$country=="USA",], type="l", lty=3, lwd=4) # lattice

Appendix: Anatomy of aes()

ggplot(data=, aes(x=, y=, color=, linetype=, shape=, size=))

ggplot2 is optimized for showing variation on all four aesthetic types

# Differences in kind using color
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_line(aes(color=country))

Note what happens when we specify the color parameter outside of the aesthetic operator. ggplot2 views these specifications as invalid graphical parameters.

# ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_line(color=country)
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_line(color="country")
## Error in grDevices::col2rgb(colour, TRUE) : invalid color name 'country'
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_line(color="red")

# Differences in kind using line types
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_line(aes(linetype=country))

# Differences in kind using point shapes
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point(aes(shape=country))

# Differences in degree using color
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point(aes(color=realgdpgr))

# Differences in degree using point size
ggplot(data=pigs.data, aes(x=year, y=outlays)) + geom_point(aes(size=realgdpgr))

# Multiple non-cartesian aesthetics (differences in kind using color, degree using point size)
ggplot(data=pigs.data, aes(x=year, y=outlays)) + 
  geom_point(aes(color=country,size=realgdpgr))